/*

	This file originates from Van Beveren et al. (2012).
	It constructs a key between CN8 and CN8+ product codes.

*/


cd ${work}1_data


*clear all
*set more off
*capture log close

* Files can be run in stata 10, 11 or 12

****************************** Directory path *******************************************
* SET DIRECTORY HERE
*cd ""
*****************************************************************************************


* Concordance file can be used to generate CN8 concordance for all years between 1988 and 2010
	* if b is the first year, then b1 is the first "effyr" (year in which first relevant changes become effective)
	* when generating a concordance between two consecutive years, only step 1 and 2 of the code below need to be executed.

* Source files to generate concordances: Eurostat ramon server
	* Necessary files: changes in CN8 classification over time + list of CN8 codes in every year
	* Original files: folder "Originals Ramon"
	
* Output files: Folder "output"
	* please note that concordance files are specific to the time period chosen here

	* CHOOSE LOCAL YEARS HERE (BEGIN AND END YEAR)			
		local b = 2002
		local b1 = `b'+1
		local e = 2015
	
*log using "output\CN8_over_time_`b'_`e'", replace


*******************************************************	
*** Step 1: preparation of files: identify mappings ***
*******************************************************

* Need to identify different type of mappings between CN in t and t+1
* i.e. growing or shrinking families respectively.
/*
	insheet using CN_concordances_1988_2010.csv, clear delimiter(;) names
	
* rename vars + procedure to break up codes (spaces need to removed) and put them back together as strings
	* from = year of obsolete code
	* to = year of new code
	* effyr = year in which change becomes effective (effyr = to)
	keep from to obsolete new
	rename to effyr
	drop from
	
	gen temp1=substr(obsolete, 1, 4)
	gen temp2=substr(obsolete, 6, 2)
	gen temp3=substr(obsolete, 9, 2)
	
	drop obsolete
	egen obsolete=concat(temp1 temp2 temp3)
	drop temp1 temp2 temp3
	
	gen temp1=substr(new, 1, 4)
	gen temp2=substr(new, 6, 2)
	gen temp3=substr(new, 9, 2)
	
	drop new
	egen new=concat(temp1 temp2 temp3)
	drop temp1 temp2 temp3
*/

*** MTB: CHANGE IN ORIGINAL CODE: UPDATED LIST OF CODE UPDATES	
	import excel using input\CN_2018_update_of_codes_xls, firstrow clear
	gen effyr=substr(Period,5,8)
	destring effyr, replace
	gen obsolete=subinstr(Origincode," ","",.)
	gen new=subinstr(Destinationcode," ","",.)
	keep effyr obsolete new
	order effyr obsolete new	
	
	
* Identify mappings within each year 
		
	* Identify one-to-many codes:
		* Note: for now, these include many-to-many codes, correction below
		sort effyr obsolete new
		gen temp=1 if obsolete==obsolete[_n-1 ]& new!=new[_n-1]
		sort effyr obsolete
		by effyr obsolete: egen one_many=max(temp)
		replace one_many=0 if one_many==.
		drop temp
	
	
	* Identify many-to-one codes:
		* Note: for now, these include many-to-many codes, correction below
		sort effyr new obsolete
		gen temp=1 if new==new[_n-1 ]& obsolete!=obsolete[_n-1]	
		sort effyr new
		by effyr new: egen many_one=max(temp)
		replace many_one=0 if many_one==.
		drop temp
	
	* identify many-to-many codes:
		* these have to be grouped in the loop below (step 2)
		gen temp=1 if many_one==1 & one_many==1
		sort effyr obsolete
		by effyr obsolete: egen temp2=max(temp)
		sort effyr new
		by effyr new: egen many_many=max(temp2)
		replace many_many=0 if many_many==.
		drop temp temp2
		
	* Identify simple codes:
		gen simple=0
		replace simple=1 if many_one==0 & one_many==0
		
	* Correction one-to-many and many-to-one coding:
		* many-to-many codes need to be taken out.
		replace one_many=0 if many_many==1
		replace many_one=0 if many_many==1
	
save temp, replace

**************************************************************	
*** Step 2: Generate setyr's for changes between t and t-1 ***
**************************************************************
* Loop 1: feedback effects within single year	
* Generate setyr's 
* i.e. assign unique identifier to each mapping
* for many-many mappings between t and t-1 we need to run loop to verify feedback effects

	foreach yr of numlist `b1'/`e' {
                 use temp, clear
                 keep if effyr==`yr'
                 sort effyr one_many many_one
                         * create groupings for one_many:
                         egen tmp=group(obsolete) if one_many==1
         
                 * create groupings for many_one:
                        * Numbering needs to start after numbering for one_many
                         egen tmpno=max(tmp)
                         if tmpno==. {
                                 * if there are no 1-M groups
                                 drop tmpno
                                 gen tmpno=0
                         		}
                                 
                         egen tmp2=group(new) if many_one==1 
                         gen tmp3=tmpno + tmp2 
                         drop tmpno
 
                 * create groupings for many_many:
                         * Numbering needs to start after numbering for many_one
                        egen tmpno=max(tmp3)
                        if tmpno==. {
                                * if there are no M-1 groups
                                drop tmpno
                                egen tmpno=max(tmp)
                                }
                        if tmpno==. {
                                * if there are no 1-M groups
                                drop tmpno
                                gen tmpno=0
 		                        }
                        egen tmp4=group(obsolete) if many_many==1
                                 * alternative: group on new here and start with obsolete in loop below
                        gen tmp5=tmpno + tmp4 
                        drop tmpno
        
                 * for simple codes, assign a setyr to the simple change:
                        egen tmpno=max(tmp5)
                        if tmpno==. {
                              * if there are no M-M groups
                              drop tmpno
                              egen tmpno=max(tmp3)
                               }
                        if tmpno==. {
                              * if there are no M-1 groups
                              drop tmpno
                              egen tmpno=max(tmp)
                               }
                        if tmpno==. {
                              * if there are no 1-M groups
                         drop tmpno
                               gen tmpno=0
                         }
                        egen tmp6=group(obsolete) if simple==1
                        gen tmp7=tmpno + tmp6
                        drop tmpno
  
                        egen tmp8=rowtotal(tmp tmp3 tmp5 tmp7)
                        gen tmp9=.`yr'
                        egen code=concat(tmp8 tmp9)
                        destring code, replace 
                        drop tmp tmp2-tmp9
                save temp_`yr', replace
                }

  
  
* loop to identify feedback effects M-M groupings within each year
	* Loop is based on Pierce and Schott (2012, forthcoming)
        * Explanation of loop:
                * function mod(`zzz', 2) is used to switch even and odd turns in the loop
                * modulus(x,y) = x - y * int(x/y)
                * equal to 0 if zzz=2, 4, 6, 8, ... 
                * different from 0 for zzz=3, 5, 7, ...
                * each time the setyr's are grouped by new or obsolete
                * until there are no more feedback effects
         
foreach yr of numlist `b1'/`e' {
         
         use temp_`yr', clear

         bysort new: egen c1 = min(code)
                * assign min setyr by new (we grouped by obsolete above for M-M)
                 local zzz = 2
                 local stop = 0
                 while `stop'==0 {
                 noisily display [`zzz']
                 local zlag = `zzz'-1
                 if mod(`zzz',2)==0 {
                        bysort obsolete: egen c`zzz'= min(c`zlag')
                  }
                 if mod(`zzz',2)~=0 {
                        bysort new: egen c`zzz'= min(c`zlag')
                  }
         
                 compare c`zzz' c`zlag'
                 gen idx = c`zzz'==c`zlag'
                 tab idx
                 local stop = r(r)==1
                 local zzz = `zzz'+1
                 display r(r) " " [`stop']
                 drop idx
                 }
  
        local yyy = `zzz' - 1
        gen setyr = c`yyy'
        drop c1-c`yyy'
 
* Verify coding:

       sort new 
       count if new==new[_n-1] & setyr!=setyr[_n-1]
 		* should equal zero
 		
       sort obsolete
       count if obsolete==obsolete[_n-1] & setyr!=setyr[_n-1]
       	* should equal zero
  
* recode, so setyr are numbered consecutively 
       sort effyr setyr
       egen group=group(setyr)
       gen tmp=.
       egen tmp2=concat(group tmp effyr)
       destring tmp2, gen(setyr2)
       drop setyr tmp tmp2
       rename setyr2 setyr
 
* prepare file for subsequent analysis:
        drop code
        format setyr %9.4fc
        destring new, replace
        destring obsolete, replace
        sort obsolete new effyr 
save temp2_`yr', replace
        rename new new`yr'
        rename obsolete obs`yr'
        rename setyr setyr`yr'
        rename effyr effyr`yr'
        order obs`yr' new`yr'
        sort obs`yr'
                        
save temp_xchain_`yr', replace
}

*** Create file for all years between `b1' and `e' ***
         * we need this later on

local b2 = `b' + 2

use temp2_`b1', clear
        
	foreach x of numlist `b2'/`e' {
            append using temp2_`x'
		save CN_setyr_`b'_`e', replace
 		} 

use CN_setyr_`b'_`e', clear
	sort obsolete new effyr
save CN_setyr_`b'_`e', replace 

* Table 3 of the WP:
	sort effyr obsolete
	table effyr if obsolete!=obsolete[_n-1], c(count obsolete)
	sort effyr new
	table effyr if new!=new[_n-1], c(count new)
	sort effyr setyr
	table effyr if setyr!=setyr[_n-1], c(count setyr)
	table effyr, c(sum simple)

	

*********************************************************	
*** Step 3: Consistent panel over time (procedure PS) ***
*********************************************************	
* This procedure is almost identical to that of Schott and Pierce (2012)
* Loop 2 (news loop PS) 
                
	*** Chains over time: identify them ***
	* Use the yearly concordance files to chain the obs-new matches across years.
	* the goal is to find news from subsequent years that updates news from earlier years
	* the joinby command assumes all possible trees from a given origin are captured
	* in this part we only keep chains (multiple changes affecting same codes in different years)
		* not changes that only affect single years

	
	foreach y of numlist `b1'/`e'{
	
		use temp_xchain_`y', clear
		rename obs`y' obs
		foreach x of numlist `b1'/`e' {
			if `x'>`y' {
				noisily display [`y'] " " [`x']
				rename new`y' obs`x'
				sort obs`x'
				joinby obs`x' using temp_xchain_`x', unmatched(master)
				noisily tab _merge
				drop if _merge==2
				rename _merge _m`y'`x'
				rename obs`x' new`y'
			}
		}
		
		
		gen _mjunk=0
		egen idx = rowmax(_m*)
			* more than one column, one for each year
			* e.g. x=1999, y=2005: change can occur in any year
			* if _m==3 in any year, we keep the row 	
		noisily tab idx
		keep if idx==3
		sort obs
		drop _m*
		save temp2_xchain_`y', replace
	}
	* Note: files temp2_xchain_`y' contain only those codes that have undergone changes in subsequent years
	

	*** Assign single setyear to all members of a family ***
	* Put all the chained changes in one file and assign single setyr to all members of a family revealed by the chain.
	* challenge here is to set a single setyr for all families revealed by the chain.
	* Two cases for a family: growing or shrinking
		
	* the iteration of min commands in the loop below takes care of both cases by searching for the setyr for
	* a family that covers all of its members.

	use temp2_xchain_`b1', clear
		foreach y of numlist `b2'/`e'{
			append using temp2_xchain_`y'
			}

	keep obs new* setyr* effyr*
	
	capture duplicates drop
	egen double setyr = rowmin(setyr*)
	egen nchain = rownonmiss(new*)
	
	rename obs obsolete
	order obs setyr
	sort obs
	save temp2_xchain, replace
		/* This file groups all families that change over time */
		/* It shows all (obsolete) codes with their linked codes in later years 
			If there is more than one change over time, the linked codes will have different setyr's
			This is addressed by the reshape command and loop below
			*/
	
	use temp2_xchain, clear
		order obs setyr
		sort obs
		drop setyr effyr*
		egen t1 = seq(), by(obs)
			* assigns number to the line by obsolete code
			* e.g. if the obsolete code maps into at the most three new codes in some year
			* t1 will equal one for the first line, 2 for the second and 3 for the third
			
		reshape long new setyr, i(obs t1) j(effyr)
			* links obsolete codes to new ones, effyr is variable, data in long format
		drop if new==. & setyr==.
		drop t1 nchain
		duplicates drop obs effyr new setyr, force
			* file with all obsolete and new codes chained over time, no unique setyr yet
		
	*** merge chains with non-chained changes from original concordance file ***	
	* Now add back in the obs-new observations that are not part of chains (from step 2)
	* have to add these in before the min loop below in case a non-chain obs-pair is part of a family
		sort obsolete new effyr
		merge obsolete new effyr using CN_setyr_`b'_`e'
		drop if effyr<`b1'| effyr >`e'
		tab _merge
		drop _merge
		
	*** Family identification loop over time
	* Example:
	* 1999: three codes become two codes in 2000 and these two codes each map into two different 
	* codes in 2001, some of which might already have existed in 1999.
	* all these codes are already linked together, but we need to assign unique setyr to all of them.

	egen double t1 = min(setyr), by(obs)
		* groups obsolete codes over time
			
	rename setyr oldsetyr
	local zzz = 2
	local stop = 0
	while `stop'==0 {
		noisily display [`zzz']
		local zlag = `zzz'-1
		if mod(`zzz',2)==0 {
			egen double t`zzz'= min(t`zlag'), by(new)
		}
	
		if mod(`zzz',2)~=0 {
			egen double t`zzz'= min(t`zlag'), by(obs)
		}
	
		compare t`zzz' t`zlag'
		gen idx = t`zzz'==t`zlag'
		tab idx
			/* idx is equal to one if t1=t2, zero otherwise (two lines in the tab) */
		local stop = r(r)==1
			/* r(r) is the no of rows in the tab, equal to one if t1 is always equal to t2, the loop can be ended */
		local zzz = `zzz'+1
		display r(r) " " [`stop']
		drop idx
		}
	

	
	local yyy = `zzz'-1
	gen double setyr = t`yyy'
	
	keep obs effyr new setyr
	duplicates drop
	sort obsolete new effyr
	format setyr`y' %9.4fc
	

save  output_file_CN_`b'_`e'_setyr, replace
!erase temp*.dta 


*********************************************************	
*** Step 4: Consistent panel over time (procedure PS) ***
*********************************************************	
*** Generate concordance file with ALL CN codes between `b' and `e' ***
	* including codes that have not changed
	* procedure draws again on Pierce and Schott (2012), but only partially
	* DIFFERENCE compared to their procedure: PS develop concordance specific to US trade data
	* We develop a concordance for all existing CN8 codes (generic for all countries + necessary to match trade and production)

foreach y of numlist `b'/`e' {


	* generate yearly files with all CN codes in each of the years between `b' and `e'
	* names of the files:  CN_`y' (originals: CN_`y'_ramon.csv)
	* merging the concordance with trade data at firm-product-country-year level is done separately.

	
	local ylead = `y'+1
	noisily display " "
	noisily display " "
	noisily display "NEW LOOP " [`y']
	noisily display " "
	noisily display " "

	* Step 1
	* get obsolete-new files ready
	* temp_obsolete is used to assign setyrs to codes that are last used in year y
	* To insure against the code ever becomming obsolete, 
	* i.e., it being an obsolete code in any year after the year of the loop

	use output_file_CN_`b'_`e'_setyr, clear
		keep if effyr>=`ylead'
		keep obsolete setyr
		drop if obsolete==.
		capture duplicates drop
		sort obsolete
	save temp_obsolete_`y', replace

	* Step 2
	* temp_new is used to assign setyrs to codes that are new in year y
	* bascially want to insure against this code ever having been a new code prior to this
	* year; if so, need to assign it a setyr
	
	use output_file_CN_`b'_`e'_setyr, clear
		keep if effyr<=`ylead'
		keep new setyr
		drop if new==.
		duplicates drop
		sort new
	save temp_new_`y', replace

	* Step 3
	* read in data and collapse to appropriate level
	
	use input\CN_`y', clear
		drop if group==.
		sort group
		
	format group %15.0f

	* merge in obsolete-code family identifiers
		rename group obsolete
		sort obsolete
		merge obsolete using temp_obsolete_`y', keep(setyr)
		noisily tab _merge
		drop if _merge==2
		drop _merge
		rename obsolete group

	* merge in new-code family identifiers
		rename group new
		sort new
		merge new using temp_new_`y', keep(setyr) update
		noisily tab _merge
		drop if _merge==2
		drop _merge
		rename new group

	save CN_`y'_concorded, replace

	
}


	* Step 4: Create panel file
	foreach y of numlist `b'/`e' {
		use CN_`y'_concorded, replace
		rename group group`y'
		drop if setyr==.
		sort setyr group`y'
		save junk_x_`y', replace
		}

	use junk_x_`b', replace
	foreach y of numlist `b1'/`e' {
		display [`y']
		merge setyr using junk_x_`y'
		tab _merge
		drop _merge
		order setyr
		sort setyr group`y'
		}


	foreach y of numlist `b'/`e' {
		egen i`y'= tag(setyr group`y')
		replace group`y'=. if i`y'==0
		drop i`y'
		}
		* command above drops doubles per year
			* if a group in a year features twice with the same setyr, it is reset to missing
	
	keep setyr group* 
	save "output\setyr_CN_`b'_`e'", replace
			
* generate concordance for each year (all codes and groups):
	* + merge with list of all CN8 codes in each year
	* will allow for identification of errors in coding or special codes not in list of CN8 codes
	
		foreach y of numlist `b'/`e' {
			use "output\setyr_CN_`b'_`e'", clear
			keep group`y' setyr 
			rename group`y' group
			sort group
			drop if group==.
			
			save setyr_CN_`y', replace
			
		use input\CN_`y', clear
			sort group
			merge group using setyr_CN_`y'
			tab _m
				* _m==2 should not occur
			rename group cn8
			rename setyr cn8plus
			keep cn8 cn8plus
			duplicates drop
			sort cn8
			count if cn8==cn8[_n-1]
				* should not occur
			gen year = `y'
			save temp_`y', replace
			}
			
		use temp_`b', clear
		forval y = `b1' / `e' {
			append using temp_`y'
			}
		sort cn8 year
		gen synthetic = (cn8plus!=.)
		replace cn8plus = cn8 if cn8plus==.
	save "output\cn8_cn8plus_`b'_`e'", replace
	outsheet using "output\cn8_cn8plus_`b'_`e'.csv", replace
		
***************************************************
*** Step 4: Concording international trade data ***
***************************************************	
/* Procedure:
		A/ Read in international trade data for the period `b' - `e'
		B/ Rename the variable recording the original (unconcorded) CN8 products as "cn8"
		C/ Make sure the cn8 variable is numeric (leading zero dropped) and sort on "cn8"
		D/ Perform a many-one merge on "cn8" with the file "CN8plus_all_`b'_`e'"
		E/ In principle, all CN8 codes that appear in the data should feature in the concordance
			* If some CN8 codes do not feature in concordance, this could be due to reporting errors or "residual" categories
			* Unmatched CN8 codes need to be dropped (no corresponding CN8+ code)
		F/ Aggregate the data from CN8 product level to CN8+ level (if data are at firm-product level, aggregate from firm-CN8 to firm-CN8+)
*/		

/* Example (Using international trade data for Belgium at the product-year level) */
	/*	
			use goods_pt_`b'_`e', clear
				* read in data and label vars
				
				label var valueII "Instrastat import value"
				label var valueIE "Extrastat import value"
				label var valueXI "Intrastat export value"
				label var valueXE "Extrastat import value"
				
				describe
				table year, c(sum valueII sum valueIE sum valueXI sum valueII) format(%20.2fc)
			
		* merge annual CN8 codes with their corresponding CN8+ code
		
			destring cn8, replace
			sort cn8 year
					
			merge cn8 year using "output\cn8_cn8plus_`b'_`e'"
			tab _m
			table cn8 if _m==1, c(sum valueII sum valueIE sum valueXE sum valueXI) format(%20.2fc)
			keep if _m==3
			replace cn8plus = cn8 if cn8plus==.
						
			tab year
			sort year cn8plus
			collapse (sum) valueII valueIE valueXI valueXE, by(year cn8plus)
			tab year 
			save goods_pt_concorded_`b'_`e', replace
			
			table year, c(sum valueII sum valueIE sum valueXI sum valueII) format(%20.2fc)
		*/	
		
				
	
*log close

erase CN_setyr_`b'_`e'.dta
erase output_file_CN_`b'_`e'_setyr.dta


!erase temp.dta

forval x=`b'/`e' {
	erase CN_`x'_concorded.dta
	erase junk_x_`x'.dta
	erase setyr_CN_`x'.dta
	erase temp_new_`x'.dta
	erase temp_obsolete_`x'.dta
	erase temp_`x'.dta
	}
	
forval x=`b1'/`e' {
	!erase temp2_xchain_`x'.dta
	!erase temp_xchain_`x'.dta
	!erase temp2_`x'.dta
	!erase temp_`x'.dta
	}
erase "output\setyr_CN_`b'_`e'.dta"
